/**
 * Project Name:webcrawler-web
 * File Name:ListLinks.java
 * Package Name:com.zongtui.links
 * Date:2015-5-4下午6:40:12
 * Copyright (c) 2015, 众推项目组版权所有.
 *
 */

package com.zongtui.links;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * ClassName: ListLinks <br/>
 * Function: Fetches a page with jsoup and collects the absolute URLs of its
 * anchor ({@code a[href]}) elements, echoing each one to standard output. <br/>
 * date: 2015-5-4 6:40:12 PM <br/>
 *
 * @author feng
 * @since JDK 1.7
 */
public class ListLinks {

	/** Page fetched by {@link #main(String[])} when no URL argument is given. */
	private static final String DEFAULT_URL = "http://www.open-open.com/jsoup/";

	/** Maximum number of link-text characters shown on each printed line. */
	private static final int MAX_TEXT_WIDTH = 35;

	/**
	 * Downloads the document at {@code url} and returns the absolute URL of
	 * every {@code <a href>} element in it, in document order.
	 * <p>
	 * Each anchor is also printed to standard output together with its
	 * (shortened) link text. Anchors whose {@code abs:href} lookup yields an
	 * empty string are printed but are not added to the returned list.
	 *
	 * @param url
	 *            address of the page to fetch
	 * @return a new mutable list of the non-empty absolute link URLs found
	 * @throws IOException
	 *             if fetching the page fails
	 */
	public List<String> getLinks(String url) throws IOException {
		Document doc = Jsoup.connect(url).get();
		Elements links = doc.select("a[href]");

		List<String> linkList = new ArrayList<String>(links.size());

		print("\nLinks: (%d)", links.size());
		for (Element link : links) {
			// Resolve once and reuse for both the log line and the result.
			String href = link.attr("abs:href");
			print(" * a: <%s>  (%s)", href, trim(link.text(), MAX_TEXT_WIDTH));

			// An empty value means no absolute URL could be built for this
			// anchor; keep such entries out of the result.
			if (!href.isEmpty()) {
				linkList.add(href);
			}
		}
		return linkList;
	}

	/**
	 * Command-line entry point: lists the links of the page named by the first
	 * argument, or of a built-in default page when no argument is supplied.
	 *
	 * @param args
	 *            optional; {@code args[0]} is the URL to fetch
	 * @throws IOException
	 *             if fetching the page fails
	 */
	public static void main(String[] args) throws IOException {
		String url = (args != null && args.length > 0) ? args[0] : DEFAULT_URL;
		ListLinks ll = new ListLinks();
		ll.getLinks(url);
	}

	/**
	 * Formats {@code msg} with {@code args} and writes it as one line to
	 * standard output.
	 */
	private static void print(String msg, Object... args) {
		System.out.println(String.format(msg, args));
	}

	/**
	 * Shortens {@code s} to at most {@code width} characters. A string longer
	 * than {@code width} is cut to {@code width - 1} characters followed by a
	 * single {@code '.'}; a non-positive {@code width} yields an empty string.
	 */
	private static String trim(String s, int width) {
		if (width < 1) {
			// Nothing fits; also avoids substring(0, negative) below.
			return "";
		}
		if (s.length() <= width) {
			return s;
		}
		return s.substring(0, width - 1) + ".";
	}

}
